## Transition probabilities of each age and income group to another income group after 5 years

# Preliminaries -----------------------------------------------------------

rm(list=ls())
gc()


## hilda file paths
hilda_qs_path <- "./HILDA Wave 19/qs files/"


## get file names of combined dta files
combined_files <- list.files(hilda_qs_path) %>% 
  str_subset("Combined_.+") %>% 
  str_remove(".qs")

## file name of HILDA longitudinal weights
long_weights_file <- list.files(hilda_qs_path) %>% 
  str_subset("longitudinal_weights_s190") %>% 
  str_remove(".qs")


## dataframe for matching year / wave ID / wave letter in HILDA
waves <- tibble(letter = letters[1:length(combined_files)], 
                year = c(2001:(2000+length(combined_files)))
) %>% 
  rowid_to_column(., "id")



# Read in HILDA grouped master ---------------------------------------

hilda_grouped <- qread("./Input data/Intermediate input data/hilda_grouped_master.qs")



# Get 5 year income group transitions --------------------------------------

## 5 years later, which income class are they in? 
hilda_inc_trans_all <- hilda_grouped %>% 
  ## what wavenumber is 5 years later?
  mutate(wavenumber_plus5 = wavenumber+5) %>% 
  ## do the following per person
  ## done this way rather than just get the observed income 5 observations later because some people might skip years but still observed in wave+5
  split(., .$xwaveid) %>% 
  ## the lapply below takes a while
  lapply( . ,
          ## for each xwaveid
          function(x) {
            ## what waves are they observed in?
            waveid_allwaves <- x$wavenumber
            
            ## what are all observed incomes?
            waveid_allincs <- x$total_inc_qtile
            
            ## are they observed in 5 years ?
            xx <- x %>% 
              mutate(waveid_obs5 = ifelse(wavenumber_plus5 %in% waveid_allwaves, 1, 0)) %>% 
              ## if yes, get the income for the wave 5 years later
              mutate(total_inc_qtile_lead5 = ifelse(waveid_obs5==1,
                                                    waveid_allincs[match(wavenumber_plus5, waveid_allwaves)],
                                                    NA) )
            
          }) %>% 
  rbindlist



# Weights for income transitions ---------------------

# by age, how does total income transition after 5 years?
# using 5-year paired longitudinal weights 

## first read in long weight data
hilda_long_weights <- qread(paste0(hilda_qs_path, long_weights_file, ".qs"))
#long_weight_varlabels <- var_label(hilda_long_weights) %>% unlist %>% as.data.frame()

## 5 year wave letter pairs
waves_pair5 <- waves %>% 
  mutate(letter_pair5 = paste0(letter, lead(letter,5)) %>% 
           ifelse(str_detect(., "NA"), NA, .) )

## get enumerated person weights for 5 year pairs in long format
long_weights_pair5 <- hilda_long_weights %>% 
  select(xwaveid, contains(paste0("wle", waves_pair5$letter_pair5)) ) %>% 
  mutate(xwaveid = as.numeric(xwaveid)) %>%  # make numeric to enable merging
  ## make the data long
  pivot_longer(cols = contains("wle"), names_to="weightname", values_to="wle") %>% 
  ## first wave number the weight applies to
  mutate(waveletter = str_sub(weightname, 4,4)) %>% 
  left_join(waves %>% select(wavenumber = id, waveletter = letter)) %>% 
  filter(wle!=0) ## wle is 0 if the obs has 0 weight


## merge weights with hilda_inc_trans_all data
hilda_inc_trans_w_weights <- hilda_inc_trans_all %>%
  ## convert xwaveid to numeric to allow merging w indiv wealth data
  mutate(xwaveid = as.numeric(xwaveid)) %>% 
  left_join(long_weights_pair5) 

## check that population totals match
# temp1 <- long_weights_pair5 %>% group_by(wavenumber) %>% summarise(n=sum(wle))
# temp2 <- hilda_data_orig %>% group_by(wavenumber) %>% summarise(n=sum(hhwte))
# temp3 <- hilda_inc_trans_w_weights %>% group_by(wavenumber) %>% summarise(n=sum(wle, na.rm=T))


# Income transitions by starting age group - weighted and summarised  ------------------------------------------

## summarise by group
hilda_inc_trans_age_summary5 <- hilda_inc_trans_w_weights %>% 
  group_by(wavenumber, age_grp, total_inc_qtile, total_inc_qtile_lead5) %>% 
  summarise(n = sum(wle, na.rm=T),
            samplefreq = n(),
            total_inc_mean = mean(total_inc)) %>% 
  filter(!is.na(total_inc_qtile_lead5)) %>% 
  ## calc probability of transitioning by wave, age and initial income qtile
  group_by(wavenumber, age_grp, total_inc_qtile) %>% 
  mutate(prob = n/sum(n))

## average probabilities across all waves
hilda_inc_trans_age_summary5_av <- hilda_inc_trans_age_summary5 %>% 
  group_by(age_grp, total_inc_qtile, total_inc_qtile_lead5) %>% 
  summarise(avprob = Hmisc::wtd.mean(prob, weights=n),
            avn = mean(n),
            sumsamplefreq = sum(samplefreq),
            total_inc_meanmean = mean(total_inc_mean)) %>% 
  ## rescale so probs sum to 1
  group_by(age_grp, total_inc_qtile) %>% 
  mutate(sumprob = sum(avprob),
         avprob = avprob/sumprob) %>% 
  select(-sumprob)


## save all transition probabilities. Model code later will determine which get used ie if only restricting inc transitions to certain age groups
qsave(hilda_inc_trans_age_summary5_av, "./Input data/income_transition_probs.qs")



## plot
# hilda_grossinc_trans_age_plot5 <- ggplot(hilda_inc_trans_age_summary5_av %>% filter(!is.na(avprob)),
#                                          aes(y = avprob, axis1 = total_inc_qtile, axis2 = total_inc_qtile_lead5)) +
#   geom_alluvium(aes(fill = total_inc_qtile)) +
#   geom_stratum(width = 1/12, fill= "black", col="white") +
#   geom_label(stat="stratum", aes(label = after_stat(stratum))) +
#   scale_x_discrete(limits = c("Original", "5 years later")) +
#   facet_wrap(vars(age_grp), ncol=3 , scales="free")






# # Check: Apply probabilities to 2018 data to examine probs ----------------------------------------
# 
# ## get wide version of transition probabilities
# hilda_inc_trans_age_summary5_av_wide <- hilda_inc_trans_age_summary5_av %>% 
#   select(age_grp, total_inc_qtile, total_inc_qtile_lead5, avprob) %>% 
#   pivot_wider(names_from = total_inc_qtile_lead5, values_from = avprob) %>% 
#   setNames( c( names(.)[1:2], paste0("lead5_", c(1:5)) )) %>% 
#   ## replace NAs with 0 
#   mutate( across(everything(), ~replace_na(.x, 0)))
# 
# ## attach to 2018 data
# hilda_inc_trans_age_summary5_w18 <- hilda_grouped %>% 
#   filter(wavenumber==18) %>% 
#   group_by(age_grp, total_inc_qtile) %>% 
#   ## get totals by group by summing wegihts
#   summarise(n=sum(hhwte)) %>% 
#   ## get transition probabilities
#   left_join(hilda_inc_trans_age_summary5_av_wide) %>% 
#   ## calculate how many ppl transition to each new quantile by multiplying probabilities with number of people
#   mutate( across(contains("lead5"), ~.x * n ) ) %>% 
#   select(-n) %>% 
#   ## convert to long format
#   pivot_longer(cols=contains("lead5"), names_to = "total_inc_qtile_lead5", values_to = "n") %>% 
#   mutate(total_inc_qtile_lead5 = total_inc_qtile_lead5 %>% str_replace("lead5_", "")) %>% 
#   ## those missing n are in highest age group. Replace w 0 for now
#   mutate(n = ifelse(is.na(n), 0, n))
# 
# ## plot
# hilda_grossinc_trans_age_plot5_w18 <- ggplot(hilda_inc_trans_age_summary5_w18 %>% 
#                                                filter(!(age_grp %in% c("[0,5)", "[5,10)", "[100,105]"))),
#                                              aes(y = n, axis1 = total_inc_qtile, axis2 = total_inc_qtile_lead5)) +
#   geom_alluvium(aes(fill = total_inc_qtile)) +
#   geom_stratum(width = 1/12, fill= "black", col="white") +
#   geom_label(stat="stratum", aes(label = after_stat(stratum))) +
#   scale_x_discrete(limits = c("Original", "5 years later")) +
#   facet_wrap(vars(age_grp), ncol=2 , scales="free")
# 
# 
# # Check: Probability of transitioning at all -------------------------------------
# 
# ## what proportion of age/inc group transition at all to a diff inc group
# hilda_inc_trans_atall_age_summary5_av <- hilda_inc_trans_age_summary5_av %>% 
#   mutate(inc_trans = ifelse(total_inc_qtile==total_inc_qtile_lead5, 0, 1)) %>% 
#   group_by(age_grp, total_inc_qtile, inc_trans) %>% 
#   mutate(transprob =sum(avprob)) 
# 
# ## what prop of age group only transition to a diff inc group, when probabilities applied to 2018 data?
# hilda_inc_trans_atall_summary5_av <- hilda_inc_trans_age_summary5_w18 %>% 
#   ## transition identifier
#   mutate(inc_trans = ifelse(total_inc_qtile==total_inc_qtile_lead5, 0, 1)) %>% 
#   group_by(age_grp, inc_trans) %>% 
#   summarise(n = sum(n)) %>% 
#   group_by(age_grp) %>% 
#   mutate(transprob = n/sum(n)) %>% 
#   filter(inc_trans == 1)
# 
# hilda_inc_trans_atall_summary5_av_plot <- ggplot(hilda_inc_trans_atall_summary5_av) +
#   geom_col(aes(x = age_grp, y=transprob)) +
#   geom_hline(yintercept=0.45)
# 
# 
# 
# ## based on above, suppose we want to allow transitions only from ages 15-35, and 55-70. (threshold of >45% of age group in 2018 transitioning)
# ## is the distribution of incomes at other ages roughly the same? yes
# 
# ##  data on  proportion of ppl in each age grp and inc grp 
# hilda_inc_trans_atall_summary5_av_2 <- hilda_inc_trans_age_summary5_w18 %>% 
#     ## what proportion are there in each age grp and inc grp over time
#     group_by(age_grp, total_inc_qtile) %>% 
#     summarise(n_inc_qtile = sum(n)) %>% 
#   group_by(age_grp) %>% 
#   mutate(prob_inc_qtile = n_inc_qtile/sum(n_inc_qtile)) %>% 
#   select(-n_inc_qtile) #%>% 
#   #pivot_wider(names_from = total_inc_qtile, values_from = prob_inc_qtile)
# 
# hilda_inc_trans_atall_summary5_av_2_plot <- ggplot(hilda_inc_trans_atall_summary5_av_2) +
#   geom_col(aes(x=age_grp, y=prob_inc_qtile, fill=total_inc_qtile))